{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.017006,
     "end_time": "2021-02-19T07:32:16.999516",
     "exception": false,
     "start_time": "2021-02-19T07:32:16.982510",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "In this notebook, I'd like to share how to label encode all wifi files in parallel using dask. On my workstation with 20 cpu cores, it took only 10 mins to encode all the data. Due to the limitation of kaggle kernel, I have to set number of workers to 2. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T07:32:17.032789Z",
     "iopub.status.busy": "2021-02-19T07:32:17.031894Z",
     "iopub.status.idle": "2021-02-19T07:32:17.783035Z",
     "shell.execute_reply": "2021-02-19T07:32:17.783614Z"
    },
    "papermill": {
     "duration": 0.769447,
     "end_time": "2021-02-19T07:32:17.783971",
     "exception": false,
     "start_time": "2021-02-19T07:32:17.014524",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "metadata  sample_submission.csv  test  train\r\n"
     ]
    }
   ],
   "source": [
    "! ls ../input/indoor-location-navigation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T07:32:17.817174Z",
     "iopub.status.busy": "2021-02-19T07:32:17.816429Z",
     "iopub.status.idle": "2021-02-19T07:32:18.815016Z",
     "shell.execute_reply": "2021-02-19T07:32:18.813025Z"
    },
    "papermill": {
     "duration": 1.016149,
     "end_time": "2021-02-19T07:32:18.815223",
     "exception": false,
     "start_time": "2021-02-19T07:32:17.799074",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import os\n",
    "import sys\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T07:32:18.859430Z",
     "iopub.status.busy": "2021-02-19T07:32:18.854138Z",
     "iopub.status.idle": "2021-02-19T07:32:19.455581Z",
     "shell.execute_reply": "2021-02-19T07:32:19.455022Z"
    },
    "papermill": {
     "duration": 0.62535,
     "end_time": "2021-02-19T07:32:19.455739",
     "exception": false,
     "start_time": "2021-02-19T07:32:18.830389",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from glob import glob\n",
    "from dask.distributed import wait\n",
    "\n",
    "SENSORS = ['acce','acce_uncali','gyro',\n",
    "           'gyro_uncali','magn','magn_uncali','ahrs']\n",
    "\n",
    "NFEAS = {\n",
    "    'acce': 3,\n",
    "    'acce_uncali': 3,\n",
    "    'gyro': 3,\n",
    "    'gyro_uncali': 3,\n",
    "    'magn': 3,\n",
    "    'magn_uncali': 3,\n",
    "    'ahrs': 3,\n",
    "    'wifi': 1,\n",
    "    'ibeacon': 1,\n",
    "    'waypoint': 3\n",
    "}\n",
    "\n",
    "ACOLS = ['timestamp','x','y','z']\n",
    "        \n",
    "FIELDS = {\n",
    "    'acce': ACOLS,\n",
    "    'acce_uncali': ACOLS,\n",
    "    'gyro': ACOLS,\n",
    "    'gyro_uncali': ACOLS,\n",
    "    'magn': ACOLS,\n",
    "    'magn_uncali': ACOLS,\n",
    "    'ahrs': ACOLS,\n",
    "    'wifi': ['timestamp','ssid','bssid','rssi','last_timestamp'],\n",
    "    'ibeacon': ['timestamp','code','rssi'],\n",
    "    'waypoint': ['timestamp','x','y']\n",
    "}\n",
    "\n",
    "def to_frame(data, col):\n",
    "    cols = FIELDS[col]\n",
    "    if data.shape[0]>0:\n",
    "        df = pd.DataFrame(data, columns=cols)\n",
    "    else:\n",
    "        df = create_dummy_df(cols)\n",
    "    for col in df.columns:\n",
    "        if 'timestamp' in col:\n",
    "            df[col] = df[col].astype('int64')\n",
    "    return df\n",
    "\n",
    "def create_dummy_df(cols):\n",
    "    df = pd.DataFrame()\n",
    "    for col in cols:\n",
    "        df[col] = [0]\n",
    "        if col in ['ssid','bssid']:\n",
    "            df[col] = df[col].map(str)\n",
    "    return df\n",
    "\n",
    "def get_building_floor(fname):\n",
    "    xx = fname.split('/')\n",
    "    return xx[-3],xx[-2]\n",
    "\n",
    "def get_test_building(name):\n",
    "    with open(name) as f:\n",
    "        for c,line in enumerate(f):\n",
    "            if c==1:\n",
    "                x = line.split()[1].split(':')[1]\n",
    "                return x  \n",
    "\n",
    "def get_floor_target(floor):\n",
    "    floor = floor.lower()\n",
    "    if floor in ['bf','bm']:\n",
    "        return None\n",
    "    elif floor == 'b':\n",
    "        return -1\n",
    "    if floor.startswith('f'):\n",
    "        return int(floor[1])-1\n",
    "    elif floor.endswith('f'):\n",
    "        return int(floor[0])-1\n",
    "    elif floor.startswith('b'):\n",
    "        return -int(floor[1])\n",
    "    elif floor.endswith('b'):\n",
    "        return -int(floor[0])\n",
    "    else:\n",
    "        return None\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T07:32:19.505222Z",
     "iopub.status.busy": "2021-02-19T07:32:19.494804Z",
     "iopub.status.idle": "2021-02-19T07:32:19.517069Z",
     "shell.execute_reply": "2021-02-19T07:32:19.516496Z"
    },
    "papermill": {
     "duration": 0.04658,
     "end_time": "2021-02-19T07:32:19.517227",
     "exception": false,
     "start_time": "2021-02-19T07:32:19.470647",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "@dataclass\n",
    "class ReadData:\n",
    "    acce: np.ndarray\n",
    "    acce_uncali: np.ndarray\n",
    "    gyro: np.ndarray\n",
    "    gyro_uncali: np.ndarray\n",
    "    magn: np.ndarray\n",
    "    magn_uncali: np.ndarray\n",
    "    ahrs: np.ndarray\n",
    "    wifi: np.ndarray\n",
    "    ibeacon: np.ndarray\n",
    "    waypoint: np.ndarray\n",
    "\n",
    "\n",
    "def read_data_file(data_filename):\n",
    "    acce = []\n",
    "    acce_uncali = []\n",
    "    gyro = []\n",
    "    gyro_uncali = []\n",
    "    magn = []\n",
    "    magn_uncali = []\n",
    "    ahrs = []\n",
    "    wifi = []\n",
    "    ibeacon = []\n",
    "    waypoint = []\n",
    "\n",
    "    with open(data_filename, 'r', encoding='utf-8') as file:\n",
    "        lines = file.readlines()\n",
    "\n",
    "    for line_data in lines:\n",
    "        line_data = line_data.strip()\n",
    "        if not line_data or line_data[0] == '#':\n",
    "            continue\n",
    "\n",
    "        line_data = line_data.split('\\t')\n",
    "\n",
    "        if line_data[1] == 'TYPE_ACCELEROMETER':\n",
    "            acce.append([int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])\n",
    "            continue\n",
    "\n",
    "        if line_data[1] == 'TYPE_ACCELEROMETER_UNCALIBRATED':\n",
    "            acce_uncali.append([int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])\n",
    "            continue\n",
    "\n",
    "        if line_data[1] == 'TYPE_GYROSCOPE':\n",
    "            gyro.append([int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])\n",
    "            continue\n",
    "\n",
    "        if line_data[1] == 'TYPE_GYROSCOPE_UNCALIBRATED':\n",
    "            gyro_uncali.append([int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])\n",
    "            continue\n",
    "\n",
    "        if line_data[1] == 'TYPE_MAGNETIC_FIELD':\n",
    "            magn.append([int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])\n",
    "            continue\n",
    "\n",
    "        if line_data[1] == 'TYPE_MAGNETIC_FIELD_UNCALIBRATED':\n",
    "            magn_uncali.append([int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])\n",
    "            continue\n",
    "\n",
    "        if line_data[1] == 'TYPE_ROTATION_VECTOR':\n",
    "            if len(line_data)>=5:\n",
    "                ahrs.append([int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])\n",
    "            continue\n",
    "\n",
    "        if line_data[1] == 'TYPE_WIFI':\n",
    "            sys_ts = line_data[0]\n",
    "            ssid = line_data[2]\n",
    "            bssid = line_data[3]\n",
    "            rssi = line_data[4]\n",
    "            lastseen_ts = line_data[6]\n",
    "            wifi_data = [sys_ts, ssid, bssid, rssi, lastseen_ts]\n",
    "            wifi.append(wifi_data)\n",
    "            continue\n",
    "\n",
    "        if line_data[1] == 'TYPE_BEACON':\n",
    "            ts = line_data[0]\n",
    "            uuid = line_data[2]\n",
    "            major = line_data[3]\n",
    "            minor = line_data[4]\n",
    "            rssi = line_data[6]\n",
    "            ibeacon_data = [ts, '_'.join([uuid, major, minor]), rssi]\n",
    "            ibeacon.append(ibeacon_data)\n",
    "            continue\n",
    "\n",
    "        if line_data[1] == 'TYPE_WAYPOINT':\n",
    "            waypoint.append([int(line_data[0]), float(line_data[2]), float(line_data[3])])\n",
    "\n",
    "    acce = np.array(acce)\n",
    "    acce_uncali = np.array(acce_uncali)\n",
    "    gyro = np.array(gyro)\n",
    "    gyro_uncali = np.array(gyro_uncali)\n",
    "    magn = np.array(magn)\n",
    "    magn_uncali = np.array(magn_uncali)\n",
    "    ahrs = np.array(ahrs)\n",
    "    wifi = np.array(wifi)\n",
    "    ibeacon = np.array(ibeacon)\n",
    "    waypoint = np.array(waypoint)\n",
    "\n",
    "    return ReadData(acce, acce_uncali, gyro, gyro_uncali, magn, magn_uncali, ahrs, wifi, ibeacon, waypoint)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T07:32:19.552943Z",
     "iopub.status.busy": "2021-02-19T07:32:19.551880Z",
     "iopub.status.idle": "2021-02-19T07:32:19.555636Z",
     "shell.execute_reply": "2021-02-19T07:32:19.555024Z"
    },
    "papermill": {
     "duration": 0.023563,
     "end_time": "2021-02-19T07:32:19.555795",
     "exception": false,
     "start_time": "2021-02-19T07:32:19.532232",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import dask\n",
    "from dask.distributed import Client, wait, LocalCluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T07:32:19.591248Z",
     "iopub.status.busy": "2021-02-19T07:32:19.590578Z",
     "iopub.status.idle": "2021-02-19T07:32:21.064629Z",
     "shell.execute_reply": "2021-02-19T07:32:21.065319Z"
    },
    "papermill": {
     "duration": 1.494609,
     "end_time": "2021-02-19T07:32:21.065605",
     "exception": false,
     "start_time": "2021-02-19T07:32:19.570996",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table style=\"border: 2px solid white;\">\n",
       "<tr>\n",
       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
       "<h3 style=\"text-align: left;\">Client</h3>\n",
       "<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n",
       "  <li><b>Scheduler: </b>tcp://127.0.0.1:40123</li>\n",
       "  <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a></li>\n",
       "</ul>\n",
       "</td>\n",
       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
       "<h3 style=\"text-align: left;\">Cluster</h3>\n",
       "<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n",
       "  <li><b>Workers: </b>2</li>\n",
       "  <li><b>Cores: </b>2</li>\n",
       "  <li><b>Memory: </b>17.18 GB</li>\n",
       "</ul>\n",
       "</td>\n",
       "</tr>\n",
       "</table>"
      ],
      "text/plain": [
       "<Client: 'tcp://127.0.0.1:40123' processes=2 threads=2, memory=17.18 GB>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# set n_workers to number of cores\n",
    "client = Client(n_workers=2, \n",
    "                threads_per_worker=1)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.015792,
     "end_time": "2021-02-19T07:32:21.097941",
     "exception": false,
     "start_time": "2021-02-19T07:32:21.082149",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "### Label encode wifi ID features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T07:32:21.137129Z",
     "iopub.status.busy": "2021-02-19T07:32:21.136113Z",
     "iopub.status.idle": "2021-02-19T07:32:21.139703Z",
     "shell.execute_reply": "2021-02-19T07:32:21.139149Z"
    },
    "papermill": {
     "duration": 0.025247,
     "end_time": "2021-02-19T07:32:21.139872",
     "exception": false,
     "start_time": "2021-02-19T07:32:21.114625",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "PATH = '../input/indoor-location-navigation'\n",
    "OUT = '../input/wifi_lbl_encode'\n",
    "os.mkdir(OUT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T07:32:21.178354Z",
     "iopub.status.busy": "2021-02-19T07:32:21.177687Z",
     "iopub.status.idle": "2021-02-19T07:32:21.188733Z",
     "shell.execute_reply": "2021-02-19T07:32:21.187818Z"
    },
    "papermill": {
     "duration": 0.031758,
     "end_time": "2021-02-19T07:32:21.188900",
     "exception": false,
     "start_time": "2021-02-19T07:32:21.157142",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def read_wifi_ssid_bssid(name):\n",
    "    data = read_data_file(name)\n",
    "    dw = to_frame(data.wifi,'wifi')\n",
    "    ss = dw['ssid'].unique()\n",
    "    bs = dw['bssid'].unique()\n",
    "    return ss,bs\n",
    "\n",
    "def encode_wifi_ssid_bssid(name, lbl_ssid, lbl_bssid):\n",
    "    data = read_data_file(name)\n",
    "    dw = to_frame(data.wifi,'wifi')\n",
    "    dw['ssid'] = lbl_ssid.transform(dw['ssid'])\n",
    "    dw['bssid'] = lbl_bssid.transform(dw['bssid'])\n",
    "    name = name.replace(PATH,OUT)\n",
    "    dw.to_csv(name,index=False)\n",
    "    return dw"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.016428,
     "end_time": "2021-02-19T07:32:21.222099",
     "exception": false,
     "start_time": "2021-02-19T07:32:21.205671",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "### Run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T07:32:21.258939Z",
     "iopub.status.busy": "2021-02-19T07:32:21.258260Z",
     "iopub.status.idle": "2021-02-19T07:32:25.615593Z",
     "shell.execute_reply": "2021-02-19T07:32:25.615024Z"
    },
    "papermill": {
     "duration": 4.377188,
     "end_time": "2021-02-19T07:32:25.615758",
     "exception": false,
     "start_time": "2021-02-19T07:32:21.238570",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(626, 26925)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_files = glob(f'{PATH}/train/*/*/*.txt')\n",
    "test_files = glob(f'{PATH}/test/*.txt')\n",
    "len(test_files),len(train_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T07:32:25.663419Z",
     "iopub.status.busy": "2021-02-19T07:32:25.662683Z",
     "iopub.status.idle": "2021-02-19T07:32:25.734282Z",
     "shell.execute_reply": "2021-02-19T07:32:25.733346Z"
    },
    "papermill": {
     "duration": 0.100222,
     "end_time": "2021-02-19T07:32:25.734466",
     "exception": false,
     "start_time": "2021-02-19T07:32:25.634244",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(numpy.ndarray,\n",
       " (19,),\n",
       " (19,),\n",
       " '../input/indoor-location-navigation/train/5cd56c0ce2acfd2d33b6ab27/B1/5d09a625bd54340008acddb9.txt')"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "name = train_files[0]\n",
    "ss,bs = read_wifi_ssid_bssid(name)\n",
    "type(ss),ss.shape,bs.shape,name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T07:32:25.783302Z",
     "iopub.status.busy": "2021-02-19T07:32:25.782524Z",
     "iopub.status.idle": "2021-02-19T07:32:25.944817Z",
     "shell.execute_reply": "2021-02-19T07:32:25.945372Z"
    },
    "papermill": {
     "duration": 0.193007,
     "end_time": "2021-02-19T07:32:25.945570",
     "exception": false,
     "start_time": "2021-02-19T07:32:25.752563",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "os.mkdir(f'{OUT}/train')\n",
    "os.mkdir(f'{OUT}/test')\n",
    "for site in os.listdir(f'{PATH}/train'):\n",
    "    os.mkdir(f'{OUT}/train/{site}')\n",
    "    for floor in os.listdir(f'{PATH}/train/{site}'):\n",
    "        os.mkdir(f'{OUT}/train/{site}/{floor}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T07:32:25.987521Z",
     "iopub.status.busy": "2021-02-19T07:32:25.986482Z",
     "iopub.status.idle": "2021-02-19T07:32:26.098839Z",
     "shell.execute_reply": "2021-02-19T07:32:26.099376Z"
    },
    "papermill": {
     "duration": 0.135143,
     "end_time": "2021-02-19T07:32:26.099568",
     "exception": false,
     "start_time": "2021-02-19T07:32:25.964425",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 26925/26925 [00:00<00:00, 288117.37it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 97.7 ms, sys: 2.84 ms, total: 101 ms\n",
      "Wall time: 101 ms\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "buildings = []\n",
    "floors = []\n",
    "used = []\n",
    "for fname in tqdm(train_files):\n",
    "    b,f = get_building_floor(fname)\n",
    "    f = get_floor_target(f)\n",
    "    if f is None:\n",
    "        continue\n",
    "    used.append(fname)\n",
    "    buildings.append(b)\n",
    "    floors.append(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T07:32:26.143381Z",
     "iopub.status.busy": "2021-02-19T07:32:26.142328Z",
     "iopub.status.idle": "2021-02-19T08:43:54.571192Z",
     "shell.execute_reply": "2021-02-19T08:43:54.571750Z"
    },
    "papermill": {
     "duration": 4288.452672,
     "end_time": "2021-02-19T08:43:54.572122",
     "exception": false,
     "start_time": "2021-02-19T07:32:26.119450",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 24732/24732 [00:11<00:00, 2222.39it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 39min 16s, sys: 1min 40s, total: 40min 57s\n",
      "Wall time: 1h 11min 28s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "futures = [] # save the future since dask is lazy, otherwise nothing is executed.\n",
    "for fname in tqdm(test_files+used):\n",
    "    f = client.submit(read_wifi_ssid_bssid,fname) \n",
    "    futures.append(f) \n",
    "_ = wait(futures)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T08:43:54.670582Z",
     "iopub.status.busy": "2021-02-19T08:43:54.669650Z",
     "iopub.status.idle": "2021-02-19T08:45:35.428441Z",
     "shell.execute_reply": "2021-02-19T08:45:35.427413Z"
    },
    "papermill": {
     "duration": 100.812271,
     "end_time": "2021-02-19T08:45:35.428636",
     "exception": false,
     "start_time": "2021-02-19T08:43:54.616365",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 24732/24732 [01:40<00:00, 246.25it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 9s, sys: 9.36 s, total: 1min 18s\n",
      "Wall time: 1min 40s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "((1536700,), (4858800,))"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ss = []\n",
    "bs = []\n",
    "for f in tqdm(futures):\n",
    "    s,b = f.result()\n",
    "    ss.append(s)\n",
    "    bs.append(b)\n",
    "ss = np.concatenate(ss)\n",
    "bs = np.concatenate(bs)\n",
    "ss.shape, bs.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T08:45:36.434012Z",
     "iopub.status.busy": "2021-02-19T08:45:36.423373Z",
     "iopub.status.idle": "2021-02-19T08:45:36.455609Z",
     "shell.execute_reply": "2021-02-19T08:45:36.454894Z"
    },
    "papermill": {
     "duration": 0.675897,
     "end_time": "2021-02-19T08:45:36.455784",
     "exception": false,
     "start_time": "2021-02-19T08:45:35.779887",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(74246,)\n",
      "CPU times: user 283 ms, sys: 6.93 ms, total: 290 ms\n",
      "Wall time: 289 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array(['0', '000015ed38fdbc763149432d7ba3b7bd208461d3',\n",
       "       '000073084baa0ea60776e25c07c6ee6b988aa072', ...,\n",
       "       'fffe51167f1ad1bf26dda45ccfc40b5d7fab8384',\n",
       "       'fffe9c57a9b25623ac219260c1b5155087a788e9',\n",
       "       'ffff286949cbddd576ed8b2d5e16ce30eab87af6'], dtype=object)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "lbl_ssid = LabelEncoder()\n",
    "lbl_ssid.fit(ss)\n",
    "print(lbl_ssid.classes_.shape)\n",
    "lbl_ssid.classes_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T08:45:38.090288Z",
     "iopub.status.busy": "2021-02-19T08:45:38.089396Z",
     "iopub.status.idle": "2021-02-19T08:45:38.127469Z",
     "shell.execute_reply": "2021-02-19T08:45:38.126844Z"
    },
    "papermill": {
     "duration": 1.317812,
     "end_time": "2021-02-19T08:45:38.127643",
     "exception": false,
     "start_time": "2021-02-19T08:45:36.809831",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(216210,)\n",
      "CPU times: user 948 ms, sys: 9.79 ms, total: 958 ms\n",
      "Wall time: 942 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array(['0', '00001d7b6fbf0a24da65285b686b03c6e796962a',\n",
       "       '0000fe40d201cfc6cada502b07f29883cd17fe4a', ...,\n",
       "       'ffff295a9e6ae90d9de67a1e1939ba969b765259',\n",
       "       'ffff2c098362b016764229a1bb5e06d10cf0d895',\n",
       "       'ffffb8116ceb5c0326ec2eb039028ec71ffdfbab'], dtype=object)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "lbl_bssid = LabelEncoder()\n",
    "lbl_bssid.fit(bs)\n",
    "print(lbl_bssid.classes_.shape)\n",
    "lbl_bssid.classes_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.349307,
     "end_time": "2021-02-19T08:45:38.841314",
     "exception": false,
     "start_time": "2021-02-19T08:45:38.492007",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "Let's look at one encode result."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T08:45:39.579857Z",
     "iopub.status.busy": "2021-02-19T08:45:39.579079Z",
     "iopub.status.idle": "2021-02-19T08:45:39.910385Z",
     "shell.execute_reply": "2021-02-19T08:45:39.909623Z"
    },
    "papermill": {
     "duration": 0.720966,
     "end_time": "2021-02-19T08:45:39.910549",
     "exception": false,
     "start_time": "2021-02-19T08:45:39.189583",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>timestamp</th>\n",
       "      <th>ssid</th>\n",
       "      <th>bssid</th>\n",
       "      <th>rssi</th>\n",
       "      <th>last_timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1560913370116</td>\n",
       "      <td>54774</td>\n",
       "      <td>69149</td>\n",
       "      <td>-90</td>\n",
       "      <td>1560913363914</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1560913370116</td>\n",
       "      <td>18591</td>\n",
       "      <td>182228</td>\n",
       "      <td>-80</td>\n",
       "      <td>1560913363973</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1560913370116</td>\n",
       "      <td>54874</td>\n",
       "      <td>109727</td>\n",
       "      <td>-87</td>\n",
       "      <td>1560913354584</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1560913370116</td>\n",
       "      <td>36901</td>\n",
       "      <td>124351</td>\n",
       "      <td>-85</td>\n",
       "      <td>1560913351034</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1560913370116</td>\n",
       "      <td>51444</td>\n",
       "      <td>114314</td>\n",
       "      <td>-84</td>\n",
       "      <td>1560913363543</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       timestamp   ssid   bssid  rssi  last_timestamp\n",
       "0  1560913370116  54774   69149   -90   1560913363914\n",
       "1  1560913370116  18591  182228   -80   1560913363973\n",
       "2  1560913370116  54874  109727   -87   1560913354584\n",
       "3  1560913370116  36901  124351   -85   1560913351034\n",
       "4  1560913370116  51444  114314   -84   1560913363543"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encode_wifi_ssid_bssid(name, lbl_ssid, lbl_bssid)\n",
    "name = name.replace(PATH,OUT)\n",
    "df = pd.read_csv(name)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T08:45:40.737958Z",
     "iopub.status.busy": "2021-02-19T08:45:40.736888Z",
     "iopub.status.idle": "2021-02-19T08:45:43.340615Z",
     "shell.execute_reply": "2021-02-19T08:45:43.339718Z"
    },
    "papermill": {
     "duration": 3.060116,
     "end_time": "2021-02-19T08:45:43.340782",
     "exception": false,
     "start_time": "2021-02-19T08:45:40.280666",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 284 ms, sys: 34 ms, total: 317 ms\n",
      "Wall time: 2.6 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "lbl_ssid_f = client.scatter(lbl_ssid)\n",
    "lbl_bssid_f = client.scatter(lbl_bssid)\n",
    "_ = wait(lbl_ssid_f)\n",
    "_ = wait(lbl_bssid_f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-19T08:45:44.071507Z",
     "iopub.status.busy": "2021-02-19T08:45:44.070692Z",
     "iopub.status.idle": "2021-02-19T10:47:34.115015Z",
     "shell.execute_reply": "2021-02-19T10:47:34.115693Z"
    },
    "papermill": {
     "duration": 7310.41872,
     "end_time": "2021-02-19T10:47:34.116072",
     "exception": false,
     "start_time": "2021-02-19T08:45:43.697352",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 24732/24732 [00:17<00:00, 1415.46it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 54min 44s, sys: 2min 6s, total: 56min 50s\n",
      "Wall time: 2h 1min 49s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "futures = [] # save the future since dask is lazy, otherwise nothing is executed.\n",
    "\n",
    "for fname in tqdm(test_files+used):\n",
    "    f = client.submit(encode_wifi_ssid_bssid, fname, lbl_ssid_f, lbl_bssid_f)\n",
    "    futures.append(f) \n",
    "_ = wait(futures)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 11729.018326,
   "end_time": "2021-02-19T10:47:39.772795",
   "environment_variables": {},
   "exception": null,
   "input_path": "__notebook__.ipynb",
   "output_path": "__notebook__.ipynb",
   "parameters": {},
   "start_time": "2021-02-19T07:32:10.754469",
   "version": "2.2.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
